Load the met data from https://raw.githubusercontent.com/USCbiostats/data-science-data/master/02_met/met_all.gz, and also the station data. For the later, you can use the code we used during lecture to pre-process the stations data:
# load the met datamet <- data.table::fread(file.path("~", "Downloads", "met_all.gz"))
# download the stations datastations <-fread("https://noaa-isd-pds.s3.amazonaws.com/isd-history.csv")stations[, USAF :=as.integer(USAF)]
Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
# dealing with NAs and 999999stations[, USAF :=fifelse(USAF ==999999, NA_integer_, USAF)]stations[, CTRY :=fifelse(CTRY =="", NA_character_, CTRY)]stations[, STATE :=fifelse(STATE =="", NA_character_, STATE)]# selecting the three relevant columns, and keeping unique recordsstations <-unique(stations[, list(USAF, CTRY, STATE)])# dropping NAsstations <- stations[!is.na(USAF)]# removing duplicatesstations[, n :=1:.N, by = .(USAF)]stations <- stations[n ==1,][, n :=NULL]
Merge the data as we did during the lecture.
# merging the met and stations data combined_data <-merge(# datax = met, y = stations, # list of variables to matchby.x ="USAFID",by.y ="USAF", # which obs to keep?all.x =TRUE, all.y =FALSE )
Question 1: Representative station for the US
What is the median station in terms of temperature, wind speed, and atmospheric pressure? Look for the three weather stations that best represent continental US using the quantile() function. Do these three coincide?
# calculating medians by station for temp, wind speed, and atm pressuremedians_USAFID <-aggregate(cbind(temp, wind.sp, atm.press) ~ USAFID + STATE, data = combined_data, FUN = median, na.rm =TRUE)# calculating the national medians for temperature, wind speed, and atm pressuremedian_temp <-quantile(combined_data$temp, 0.5, na.rm =TRUE)median_wind <-quantile(combined_data$wind.sp, 0.5, na.rm =TRUE)median_press <-quantile(combined_data$atm.press, 0.5, na.rm =TRUE)# Finding the stations closest to the median for each variableclosest_temp <- medians_USAFID[which.min(abs(medians_USAFID$temp - median_temp)), ]closest_wind <- medians_USAFID[which.min(abs(medians_USAFID$wind.sp - median_wind)), ]closest_press <- medians_USAFID[which.min(abs(medians_USAFID$atm.press - median_press)), ]# printing the closest stationsclosest_temp
USAFID STATE temp wind.sp atm.press
77 722860 CA 23.55 1.5 1012.5
closest_wind
USAFID STATE temp wind.sp atm.press
5 722235 AL 27.8 2.1 1015.1
closest_press
USAFID STATE temp wind.sp atm.press
109 723830 CA 23.3 5.1 1014.1
# checking if the three stations coincide (i.e., are the same)stations_coincide <-identical(closest_temp$USAFID, closest_wind$USAFID) &&identical(closest_wind$USAFID, closest_press$USAFID)# output whether the stations coincidestations_coincide
[1] FALSE
The analysis of the median weather stations for temperature, wind speed, and atmospheric pressure reveals that the stations do not coincide. The median values for the data are 23.5°C for temperature, 2.1 m/s for wind speed, and 1014 hPa for atmospheric pressure. The closest station to the median temperature is USAFID 722860 located in California, with a temperature of 23.55°C. For wind speed, USAFID 722235 in Alabama best matches the median, with an exact wind speed of 2.1 m/s. Lastly, USAFID 723830, also in California, aligns with the median atmospheric pressure of 1014 hPa. Therefore, the weather stations representing the median values for temperature, wind speed, and atmospheric pressure are from different locations, and they do not coincide.
Question 2: Representative station per state
Just like the previous question, you are asked to identify what is the most representative, the median, station per state. This time, instead of looking at one variable at a time, look at the euclidean distance. If multiple stations show in the median, select the one located at the lowest latitude.
# find the medians of each state for temp, wind speed, and pressuremedians_by_state <- combined_data |>group_by(STATE) |>summarize(median_temp =median(temp, na.rm =TRUE),median_wind =median(wind.sp, na.rm =TRUE),median_press =median(atm.press, na.rm =TRUE) )# find the most representative stations using Euclidean distance per stateclosest_usafid_by_state <- combined_data |>group_by(STATE, USAFID) |>summarize(temp =median(temp, na.rm =TRUE),wind.sp =median(wind.sp, na.rm =TRUE),atm.press =median(atm.press, na.rm =TRUE),lat =median(lat, na.rm =TRUE) # Include latitude for tie-breaking ) |>left_join(medians_by_state, by ="STATE") |>mutate(# Calculate the Euclidean distance for each station to the state's medianeuclidean_dist =sqrt( (temp - median_temp)^2+ (wind.sp - median_wind)^2+ (atm.press - median_press)^2 ) ) |>group_by(STATE) |># In case of ties (stations with the same distance), pick the one with the lowest latitudeslice_min(order_by = euclidean_dist, with_ties =TRUE) |>slice_min(order_by = lat) |>select(STATE, USAFID, euclidean_dist, lat)
`summarise()` has grouped output by 'STATE'. You can override using the
`.groups` argument.
# print the resultclosest_usafid_by_state
# A tibble: 48 × 4
# Groups: STATE [48]
STATE USAFID euclidean_dist lat
<chr> <int> <dbl> <dbl>
1 AL 723235 0.300 34.7
2 AR 723417 0.412 34.2
3 AZ 722745 0.860 32.2
4 CA 722931 0.502 32.9
5 CO 724676 0.837 39.2
6 CT 725087 0.5 41.7
7 DE 724180 0.200 39.7
8 FL 722108 0.206 26.5
9 GA 722195 0.412 33.8
10 IA 725450 0.447 41.9
# ℹ 38 more rows
Question 3: In the middle?
For each state, identify what is the station that is closest to the mid-point of the state. Combining these with the stations you identified in the previous question, use leaflet() to visualize all ~100 points in the same figure, applying different colors for those identified in this question.
# calculating the midpoints for each statestate_midpoints <- combined_data |>group_by(STATE) |>summarize(mid_lat =mean(lat, na.rm =TRUE), # Replace with your latitude column namemid_lon =mean(lon, na.rm =TRUE), # Replace with your longitude column name.groups ='drop' )# finding the closest station to each state midpointclosest_to_midpoint <- state_midpoints |>rowwise() |>mutate(USAFID = combined_data$USAFID[which.min(sqrt((combined_data$lat - mid_lat)^2+ (combined_data$lon - mid_lon)^2))] ) |>ungroup()# combining the closest stations for graphingcombined_stations <- closest_to_midpoint |>left_join(combined_data |>select(USAFID, lat, lon), by ="USAFID")# graphing using leafletlibrary(leaflet)# leaflet mapmap <-leaflet() |>addTiles() # Add OpenStreetMap tiles# adding points for closest stations to midpointsmap <- map |>addCircleMarkers(data = combined_stations,lng =~lon, lat =~lat,color ="turquoise", # Color for closest stations to midpointsradius =5,popup =~paste("Station ID:", USAFID),group ="Closest Stations")# adding points for actual state midpointsmap <- map |>addCircleMarkers(data = state_midpoints,lng =~mid_lon, lat =~mid_lat,color ="pink", # Color for actual state midpointsradius =7,label =~paste("State Midpoint:", STATE),group ="State Midpoints")# adding layer control to toggle visibilitymap <- map |>addLayersControl(overlayGroups =c("Closest Stations", "State Midpoints"),options =layersControlOptions(collapsed =FALSE) )# showing the mapmap
Question 4: Means of means
Using the quantile() function, generate a summary table that shows the number of states included, average temperature, wind-speed, and atmospheric pressure by the variable “average temperature level,” which you’ll need to create.
Start by computing the states’ average temperature. Use that measurement to classify them according to the following criteria:
low: temp < 20
Mid: temp >= 20 and temp < 25
High: temp >= 25
Once you are done with that, you can compute the following:
Number of entries (records),
Number of NA entries,
Number of stations,
Number of states included, and
Mean temperature, wind-speed, and atmospheric pressure.
All by the levels described before.
# calculating the average temperature for each statestate_avg_temp <- combined_data |>group_by(STATE) |>summarize(avg_temp =mean(temp, na.rm =TRUE),avg_wind =mean(wind.sp, na.rm =TRUE),avg_press =mean(atm.press, na.rm =TRUE),num_entries =n(),num_na =sum(is.na(temp)), # Count NA entries for temperature.groups ='drop' )# classifying the states into "low," "mid," and "high" temperature levelsstate_avg_temp <- state_avg_temp |>mutate(temp_level =case_when( avg_temp <20~"Low", avg_temp >=20& avg_temp <25~"Mid", avg_temp >=25~"High" ) )#generating a summary table summary_table <- state_avg_temp |>group_by(temp_level) |>summarize(num_states =n(),num_stations =n_distinct(STATE), # Number of unique statestotal_entries =sum(num_entries), # Total number of entriestotal_na =sum(num_na), # Total number of NA entriesavg_temp =mean(avg_temp, na.rm =TRUE),avg_wind =mean(avg_wind, na.rm =TRUE),avg_press =mean(avg_press, na.rm =TRUE),.groups ='drop' )# printing the summary tablesummary_table